In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from evidently import Report, Dataset, DataDefinition, Regression
from evidently.presets import DataDriftPreset, RegressionPreset
from evidently.ui.workspace import CloudWorkspace
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Never truncate columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None
# Suppress scikit-learn's UndefinedMetricWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
In [2]:
# Set up Evidently Cloud. Credentials are read from the environment,
# after load_dotenv() has had a chance to populate it from a .env file.
load_dotenv()
org_id = os.getenv("EVIDENTLY_ORG_ID")  # read but not used by the calls below
api_key = os.getenv("EVIDENTLY_API_KEY")
project_id = os.getenv("EVIDENTLY_PROJECT_ID")

# Fail fast with an actionable message rather than letting an unset variable
# surface later as an opaque client or attribute error.
_missing = [
    var_name
    for var_name, value in (
        ("EVIDENTLY_API_KEY", api_key),
        ("EVIDENTLY_PROJECT_ID", project_id),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

ws = CloudWorkspace(token=api_key, url="https://app.evidently.cloud")
project = ws.get_project(project_id)
In [3]:
# Read the raw CSV, decoding it as latin1.
csv_path = "cancer_reg.csv"
df = pd.read_csv(csv_path, encoding="latin1")
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head(n=5)
(3047, 34)
Out[3]:
| avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | MedianAgeMale | MedianAgeFemale | Geography | AvgHouseholdSize | PercentMarried | PctNoHS18_24 | PctHS18_24 | PctSomeCol18_24 | PctBachDeg18_24 | PctHS25_Over | PctBachDeg25_Over | PctEmployed16_Over | PctUnemployed16_Over | PctPrivateCoverage | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1397.0 | 469 | 164.9 | 489.8 | 61898 | 260131 | 11.2 | 499.748204 | (61494.5, 125635] | 39.3 | 36.9 | 41.7 | Kitsap County, Washington | 2.54 | 52.5 | 11.5 | 39.5 | 42.1 | 6.9 | 23.2 | 19.6 | 51.9 | 8.0 | 75.1 | NaN | 41.6 | 32.9 | 14.0 | 81.780529 | 2.594728 | 4.821857 | 1.843479 | 52.856076 | 6.118831 |
| 1 | 173.0 | 70 | 161.3 | 411.6 | 48127 | 43269 | 18.6 | 23.111234 | (48021.6, 51046.4] | 33.0 | 32.2 | 33.7 | Kittitas County, Washington | 2.34 | 44.5 | 6.1 | 22.4 | 64.0 | 7.5 | 26.0 | 22.7 | 55.9 | 7.8 | 70.2 | 53.8 | 43.6 | 31.1 | 15.3 | 89.228509 | 0.969102 | 2.246233 | 3.741352 | 45.372500 | 4.333096 |
| 2 | 102.0 | 50 | 174.7 | 349.7 | 49348 | 21026 | 14.6 | 47.560164 | (48021.6, 51046.4] | 45.0 | 44.0 | 45.8 | Klickitat County, Washington | 2.62 | 54.2 | 24.0 | 36.6 | NaN | 9.5 | 29.0 | 16.0 | 45.9 | 7.0 | 63.7 | 43.5 | 34.9 | 42.1 | 21.1 | 90.922190 | 0.739673 | 0.465898 | 2.747358 | 54.444868 | 3.729488 |
| 3 | 427.0 | 202 | 194.8 | 430.4 | 44243 | 75882 | 17.1 | 342.637253 | (42724.4, 45201] | 42.8 | 42.2 | 43.4 | Lewis County, Washington | 2.52 | 52.7 | 20.2 | 41.2 | 36.1 | 2.5 | 31.6 | 9.3 | 48.3 | 12.1 | 58.4 | 40.3 | 35.0 | 45.3 | 25.0 | 91.744686 | 0.782626 | 1.161359 | 1.362643 | 51.021514 | 4.603841 |
| 4 | 57.0 | 26 | 144.4 | 350.1 | 49955 | 10321 | 12.5 | 0.000000 | (48021.6, 51046.4] | 48.3 | 47.8 | 48.9 | Lincoln County, Washington | 2.34 | 57.8 | 14.9 | 43.0 | 40.0 | 2.0 | 33.4 | 15.0 | 48.2 | 4.8 | 61.6 | 43.9 | 35.1 | 44.0 | 22.7 | 94.104024 | 0.270192 | 0.665830 | 0.492135 | 54.027460 | 6.796657 |
In [4]:
# Preprocess data: use every column as a feature except the target itself
# and the two text columns (county name and binned-income interval).
target = "TARGET_deathRate"
excluded_columns = {target, "Geography", "binnedInc"}
features = [col for col in df.columns if col not in excluded_columns]

# Split data (80/20 hold-out, fixed seed so the split is reproducible)
train, test = train_test_split(df, test_size=0.2, random_state=42)
X_train, y_train = train[features], train[target]
X_test_orig, y_test_orig = test[features], test[target]

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on original test set (numpy is imported with the other imports
# at the top of the notebook rather than in the middle of this cell)
pred_test_orig = model.predict(X_test_orig)
print("Original Test RMSE:", np.sqrt(mean_squared_error(y_test_orig, pred_test_orig)))
print("Original Test R²:", r2_score(y_test_orig, pred_test_orig))
Original Test RMSE: 19.141634432544688 Original Test R²: 0.5522098578423327
In [5]:
# Helper to evaluate and report results
def evaluate_and_report(name, X_cur, y_true):
    """Score the trained model on one scenario and upload an Evidently report.

    Relies on the notebook globals ``model``, ``train``, ``features``,
    ``target``, ``ws`` and ``project``.

    The reference dataset is the training data with the model's predictions
    on it (recomputed on every call); the current dataset is ``X_cur`` with
    ``y_true`` and the model's predictions attached.

    Parameters
    ----------
    name : str
        Scenario label printed above the metrics.
    X_cur : pandas.DataFrame
        Feature frame for the scenario. It is copied, never modified.
    y_true : array-like
        Ground-truth target values in the same row order as ``X_cur``.

    Returns
    -------
    The object returned by ``Report.run`` for this scenario.

    Raises
    ------
    ValueError
        If ``X_cur`` and ``y_true`` do not have the same number of rows.
    """
    preds = model.predict(X_cur)

    # Work with positional values. The sklearn metrics below pair y_true with
    # preds by position; assigning a Series into `cur` would instead align on
    # the index and could silently produce NaN or misordered targets in the report.
    y_values = np.asarray(y_true)
    if len(y_values) != len(X_cur):
        raise ValueError(
            f"X_cur has {len(X_cur)} rows but y_true has {len(y_values)} values"
        )

    # Reference data: training features, target, and in-sample predictions.
    ref = train[features].copy()
    ref[target] = train[target]
    ref["prediction"] = model.predict(train[features])

    # Current data: scenario features, target, and predictions.
    cur = X_cur.copy()
    cur[target] = y_values
    cur["prediction"] = preds

    data_def = DataDefinition(
        regression=[Regression(target=target, prediction="prediction")]
    )
    ref_ds = Dataset.from_pandas(ref, data_definition=data_def)
    cur_ds = Dataset.from_pandas(cur, data_definition=data_def)

    rmse = np.sqrt(mean_squared_error(y_values, preds))
    # R² is only computed when there are at least two samples.
    r2 = r2_score(y_values, preds) if len(y_values) >= 2 else float("nan")
    print(f"\n== {name} ==")
    print("RMSE:", rmse, "R²:", r2)

    # Run evaluation
    report = Report(metrics=[DataDriftPreset(), RegressionPreset()])
    eval_result = report.run(reference_data=ref_ds, current_data=cur_ds)

    # Upload to Cloud (the run is added with include_data=False)
    ws.add_run(project.id, eval_result, include_data=False)
    return eval_result
# Baseline: the unmodified hold-out test set. The returned result is the cell's output.
evaluate_and_report("Baseline (Original Test)", X_test_orig, y_test_orig)
== Baseline (Original Test) == RMSE: 19.141634432544688 R²: 0.5522098578423327
Out[5]:
In [6]:
# Build three cumulative drift scenarios (A, A+B, A+B+C) from the original
# test features. Each step returns a new frame; X_test_orig is left untouched.

# Scenario A: medIncome lowered by 40k
test_A = X_test_orig.assign(medIncome=X_test_orig["medIncome"] - 40000)

# Scenario A+B: on top of A, povertyPercent raised by 20
test_AB = test_A.assign(povertyPercent=test_A["povertyPercent"] + 20)

# Scenario A+B+C: on top of A+B, AvgHouseholdSize raised by 2
test_ABC = test_AB.assign(AvgHouseholdSize=test_AB["AvgHouseholdSize"] + 2)
In [7]:
# Scenario A: shifted medIncome only, scored against the original test targets.
evaluate_and_report("Scenario A (Income ↓40k)", test_A, y_test_orig)
== Scenario A (Income ↓40k) == RMSE: 21.5767213114231 R²: 0.43103258174724723
Out[7]:
In [8]:
# Scenario A+B: shifted medIncome and povertyPercent, scored against the original test targets.
evaluate_and_report("Scenario A+B (Income ↓, Poverty ↑)", test_AB, y_test_orig)
== Scenario A+B (Income ↓, Poverty ↑) == RMSE: 23.02972891944842 R²: 0.35182221121125634
Out[8]:
In [9]:
# Scenario A+B+C: all three shifted features, scored against the original test targets.
evaluate_and_report("Scenario A+B+C (Income ↓, Poverty ↑, HH Size ↑)", test_ABC, y_test_orig)
== Scenario A+B+C (Income ↓, Poverty ↑, HH Size ↑) == RMSE: 22.529505067326458 R²: 0.37967426997046405
Out[9]:
In [ ]: